# Load the scraped Fortune 500 dataset (semicolon-separated, "n/a" marks missing values).
# stringsAsFactors = TRUE is requested explicitly: the numeric conversion below relies on
# text columns being factors, so that as.numeric() yields integer level codes instead of NA.
# That was the read.csv() default before R 4.0.0 but is not any more.
total_500 <- read.csv(
  "~/GitHub/thesis_msc_business_analytics/Python/total_500_new.csv",
  sep = ";",
  na.strings = "n/a",
  stringsAsFactors = TRUE
)
# Number of observations and variables
dim(total_500)
## [1] 500 730
# Work on a copy so that the raw import stays untouched
total_500_sub <- total_500
# The four financial columns use a decimal comma; replace it with a decimal point
decimal_comma_cols <- c(
  "Assets..", "Market.value..", "Revenues..", "Total.Stockholder.Equity.."
)
for (col_name in decimal_comma_cols) {
  total_500_sub[[col_name]] <- gsub(",", ".", total_500_sub[[col_name]])
}
# Make every column numeric except column 19, which is deliberately skipped
# (presumably the textual Readability label that is recoded later -- TODO confirm).
# The column range is derived from the data instead of being hard-coded to 730.
numeric_cols <- setdiff(seq_len(ncol(total_500_sub)), 19)
total_500_sub[numeric_cols] <- lapply(total_500_sub[numeric_cols], as.numeric)
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
# Drop every row that still contains a missing value
total_500_final <- na.omit(total_500_sub)
# Rename column 1 (X) to Ranking and give the financial columns readable names
names(total_500_final)[c(1, 2, 3, 4, 6)] <- c(
  "Ranking", "Assets", "Market_Value", "Revenues", "Total_SH_Equity"
)
# Delete the variables we will not need
total_500_final$Revenues...1 <- NULL # Revenues %
total_500_final$company <- NULL # company name
total_500_final$url <- NULL # company url
#we upload the libraries beneath that we will use in the analysis
library(ggplot2)
library(reshape2)
library(DAAG)
## Loading required package: lattice
# Observations and variables that remain after cleaning
dim(total_500_final)
## [1] 408 727
#######################################################################################################
# Histograms of the four Fortune financial variables, to get a good grasp of
# how each of them is distributed
ggplot(data = total_500_final, mapping = aes(x = Revenues)) +
  geom_histogram(binwidth = 50, colour = "green", fill = "darkgreen")

ggplot(data = total_500_final, mapping = aes(x = Assets)) +
  geom_histogram(binwidth = 100, colour = "red", fill = "darkred")

ggplot(data = total_500_final, mapping = aes(x = Market_Value)) +
  geom_histogram(binwidth = 100, colour = "blue", fill = "darkblue")

ggplot(data = total_500_final, mapping = aes(x = Total_SH_Equity)) +
  geom_histogram(binwidth = 100, colour = "purple", fill = "pink")

###############################################################################################
# Scatter plots of each Fortune 500 financial variable (x) against the Ranking (y)
ggplot(data = total_500_final, mapping = aes(x = Assets, y = Ranking)) +
  geom_point(colour = "red")

ggplot(data = total_500_final, mapping = aes(x = Market_Value, y = Ranking)) +
  geom_point(colour = "blue")

ggplot(data = total_500_final, mapping = aes(x = Total_SH_Equity, y = Ranking)) +
  geom_point(colour = "purple")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = Ranking)) +
  geom_point(colour = "green")

# The Ranking is closely tied to the Revenues, so only one of those two variables is
# needed when checking relationships with the website metrics.
# For a clearer look we also build a correlation matrix of the Fortune variables.
# Columns are selected by name (not by position) so the subset survives column reordering.
fortune_vars <- c("Ranking", "Assets", "Market_Value", "Revenues", "Total_SH_Equity")
total_500_fortune <- total_500_final[, fortune_vars]
library(corrplot)
library(caret)
sm <- cor(total_500_fortune)
sm
## Ranking Assets Market_Value Revenues
## Ranking 1.0000000 -0.36673307 -0.15959008 -0.67511457
## Assets -0.3667331 1.00000000 0.16787320 0.43479882
## Market_Value -0.1595901 0.16787320 1.00000000 0.31085660
## Revenues -0.6751146 0.43479882 0.31085660 1.00000000
## Total_SH_Equity 0.1327272 -0.03638159 -0.02912268 -0.05616772
## Total_SH_Equity
## Ranking 0.13272724
## Assets -0.03638159
## Market_Value -0.02912268
## Revenues -0.05616772
## Total_SH_Equity 1.00000000
# Reuse the matrix computed above instead of calling cor() a second time
corrplot(sm, method = "number")

# From this plot we see that Ranking and Revenues have the strongest correlation (about -0.68).
##########################################################################################################
#Firstly we will analyze the social media relevance with the sites.
#We will see how many of the sites have social media and what type of social media
#Facebook
# Share of sites without (0) and with (1) a Facebook link; the denominator is the
# actual row count rather than a hard-coded 408.
social_media_facebook <- round(table(total_500_final$facebook) / nrow(total_500_final), 3)
social_media_facebook
##
## 0 1
## 0.353 0.647
# Derive the pie labels from the computed shares so they cannot drift from the data
slicelable <- paste(round(100 * as.vector(social_media_facebook), 1), c("% no", "% yes"))
pie(
  social_media_facebook,
  labels = slicelable,
  main = "Share of companies with Facebook",
  col = rainbow(length(social_media_facebook))
)

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = facebook)) +
  geom_point(size = 3, colour = "darkblue")

#Twitter
# Share of sites without (0) and with (1) a Twitter link; the denominator is the
# actual row count rather than a hard-coded 408.
social_media_twitter <- round(table(total_500_final$twitter) / nrow(total_500_final), 3)
social_media_twitter
##
## 0 1
## 0.314 0.686
# Derive the pie labels from the computed shares so they cannot drift from the data
slicelable <- paste(round(100 * as.vector(social_media_twitter), 1), c("% no", "% yes"))
pie(
  social_media_twitter,
  labels = slicelable,
  main = "Share of companies with Twitter",
  col = rainbow(length(social_media_twitter))
)

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = twitter)) +
  geom_point(size = 3, colour = "darkgreen")

#Instagram
# Share of sites without (0) and with (1) an Instagram link; the denominator is the
# actual row count rather than a hard-coded 408.
social_media_instagram <- round(table(total_500_final$instagram) / nrow(total_500_final), 3)
social_media_instagram
##
## 0 1
## 0.777 0.223
# Derive the pie labels from the computed shares so they cannot drift from the data
slicelable <- paste(round(100 * as.vector(social_media_instagram), 1), c("% no", "% yes"))
pie(
  social_media_instagram,
  labels = slicelable,
  main = "Share of companies with Instagram",
  col = rainbow(length(social_media_instagram))
)

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = instagram)) +
  geom_point(size = 3, colour = "pink")

#Pinterest
# Share of sites without (0) and with (1) a Pinterest link; the denominator is the
# actual row count rather than a hard-coded 408.
social_media_pinterest <- round(table(total_500_final$pinterest) / nrow(total_500_final), 3)
social_media_pinterest
##
## 0 1
## 0.902 0.098
# Derive the pie labels from the computed shares so they cannot drift from the data
slicelable <- paste(round(100 * as.vector(social_media_pinterest), 1), c("% no", "% yes"))
pie(
  social_media_pinterest,
  labels = slicelable,
  main = "Share of companies with Pinterest",
  col = rainbow(length(social_media_pinterest))
)

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = pinterest)) +
  geom_point(size = 3, colour = "darkred")

#Youtube
# Share of sites without (0) and with (1) a YouTube link; the denominator is the
# actual row count rather than a hard-coded 408.
social_media_youtube <- round(table(total_500_final$youtube) / nrow(total_500_final), 3)
social_media_youtube
##
## 0 1
## 0.417 0.583
# Derive the pie labels from the computed shares so they cannot drift from the data
slicelable <- paste(round(100 * as.vector(social_media_youtube), 1), c("% no", "% yes"))
pie(
  social_media_youtube,
  labels = slicelable,
  main = "Share of companies with Youtube",
  col = rainbow(length(social_media_youtube))
)

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = youtube)) +
  geom_point(size = 3, colour = "red")

#LinkedIn
# Share of sites without (0) and with (1) a LinkedIn link; the denominator is the
# actual row count rather than a hard-coded 408.
social_media_linkedin <- round(table(total_500_final$linkedin) / nrow(total_500_final), 3)
social_media_linkedin
##
## 0 1
## 0.429 0.571
# Derive the pie labels from the computed shares so they cannot drift from the data
slicelable <- paste(round(100 * as.vector(social_media_linkedin), 1), c("% no", "% yes"))
pie(
  social_media_linkedin,
  labels = slicelable,
  main = "Share of companies with Linkedin",
  col = rainbow(length(social_media_linkedin))
)

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = linkedin)) +
  geom_point(size = 3, colour = "blue")

# Correlations between Revenues and the social media indicators.
# Columns are selected by name (not by position) so the subset survives column reordering.
social_vars <- c(
  "Revenues", "facebook", "instagram", "linkedin", "pinterest", "twitter", "youtube"
)
total_500_social_media <- total_500_final[, social_vars]
library(corrplot)
library(caret)
sm <- cor(total_500_social_media)
sm
## Revenues facebook instagram linkedin pinterest
## Revenues 1.000000000 0.01121852 0.05771665 -0.008311532 0.09686843
## facebook 0.011218524 1.00000000 0.35874256 0.520581725 0.24349238
## instagram 0.057716654 0.35874256 1.00000000 0.143134960 0.37774489
## linkedin -0.008311532 0.52058172 0.14313496 1.000000000 -0.03069495
## pinterest 0.096868426 0.24349238 0.37774489 -0.030694951 1.00000000
## twitter 0.002185367 0.67230226 0.32419034 0.577378625 0.20514804
## youtube 0.074833925 0.54096275 0.32145351 0.482997415 0.19504737
## twitter youtube
## Revenues 0.002185367 0.07483393
## facebook 0.672302259 0.54096275
## instagram 0.324190344 0.32145351
## linkedin 0.577378625 0.48299741
## pinterest 0.205148042 0.19504737
## twitter 1.000000000 0.52142857
## youtube 0.521428571 1.00000000
# Reuse the matrix computed above instead of calling cor() a second time
corrplot(sm, method = "number")

# We see that facebook has a correlation above 0.5 with twitter, youtube and linkedin,
# and that the smallest correlations are those of pinterest and instagram.
#########################################################################################################
# Link counts: for total, external and internal links we draw a histogram of the
# counts and a scatter plot of the counts against Revenues.
par(mfrow = c(1, 1))
library(ggplot2)
ggplot(data = total_500_final, mapping = aes(x = total.links)) +
  geom_histogram(binwidth = 50, colour = "darkblue", fill = "blue")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = total.links)) +
  geom_point(size = 3, colour = "darkblue")

ggplot(data = total_500_final, mapping = aes(x = external)) +
  geom_histogram(binwidth = 50, colour = "darkred", fill = "red")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = external)) +
  geom_point(size = 3, colour = "darkred")

ggplot(data = total_500_final, mapping = aes(x = internal)) +
  geom_histogram(binwidth = 50, colour = "darkgreen", fill = "green")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = internal)) +
  geom_point(size = 3, colour = "darkgreen")

# Correlations between Revenues and the link counts.
# Columns are selected by name (not by position) so the subset survives column reordering.
link_vars <- c("Revenues", "external", "internal", "total.links")
total_500_links <- total_500_final[, link_vars]
library(corrplot)
library(caret)
tl <- cor(total_500_links)
tl
## Revenues external internal total.links
## Revenues 1.00000000 0.034100506 0.004559950 0.01538199
## external 0.03410051 1.000000000 -0.002593961 0.32202419
## internal 0.00455995 -0.002593961 1.000000000 0.94589294
## total.links 0.01538199 0.322024191 0.945892937 1.00000000
# Reuse the matrix computed above instead of calling cor() a second time
corrplot(tl, method = "number")

# The total links and the internal links have a correlation of almost 0.95,
# so we will not include the total links in the regression model.
#########################################################################################################
# Loading time per site: its distribution, and its relationship with Revenues
ggplot(data = total_500_final, mapping = aes(x = loading.time)) +
  geom_histogram(binwidth = 1, colour = "pink", fill = "purple")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = loading.time)) +
  geom_point(size = 3, colour = "purple")

#########################################################################################################
# Sentences, unique words and total words: how each is distributed on its own
# and how it relates to the Revenues.
ggplot(data = total_500_final, mapping = aes(x = Sentences)) +
  geom_histogram(binwidth = 50, colour = "darkred", fill = "red")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = Sentences)) +
  geom_point(size = 3, colour = "purple")

#########################
ggplot(data = total_500_final, mapping = aes(x = Unique.words)) +
  geom_histogram(binwidth = 50, colour = "darkred", fill = "red")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = Unique.words)) +
  geom_point(size = 3, colour = "purple")

#########################
ggplot(data = total_500_final, mapping = aes(x = Words)) +
  geom_histogram(binwidth = 50, colour = "darkred", fill = "red")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = Words)) +
  geom_point(size = 3, colour = "purple")

#############################
# Correlations between Revenues, the text-volume variables and the loading time.
# Columns are selected by name (not by position) so the subset survives column reordering.
text_vars <- c("Revenues", "Sentences", "Unique.words", "Words", "loading.time")
total_500_lt_w <- total_500_final[, text_vars]
library(corrplot)
library(caret)
tl <- cor(total_500_lt_w)
tl
## Revenues Sentences Unique.words Words loading.time
## Revenues 1.00000000 -0.01183819 -0.04362118 -0.03479049 -0.1212650
## Sentences -0.01183819 1.00000000 0.69454327 0.78851979 0.1497520
## Unique.words -0.04362118 0.69454327 1.00000000 0.93243940 0.1994296
## Words -0.03479049 0.78851979 0.93243940 1.00000000 0.1857922
## loading.time -0.12126500 0.14975205 0.19942956 0.18579225 1.0000000
# Reuse the matrix computed above instead of calling cor() a second time
corrplot(tl, method = "number")

################################
# Flesch measure (column Flesh_Mesaure): its distribution, and its relationship with Revenues
ggplot(data = total_500_final, mapping = aes(x = Flesh_Mesaure)) +
  geom_histogram(binwidth = 50, colour = "darkred", fill = "red")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = Flesh_Mesaure)) +
  geom_point(size = 3, colour = "purple")

############################
# Readability labels mapped to sortable codes, listed from easiest to hardest.
# The replacements are applied in this order, exactly as the original script did.
readability_codes <- c(
  "Very easy" = "01_VE",
  "Easy" = "02_E",
  "Fairly easy" = "03_FE",
  "Standard" = "04_St",
  "Fairly difficult" = "05_FD",
  "Difficult" = "06_D",
  "Very Confusing" = "07_VC"
)
for (label in names(readability_codes)) {
  total_500_final$Readability <- gsub(
    label, readability_codes[[label]], total_500_final$Readability
  )
}
# The numeric prefix of the codes keeps the bars ordered from easiest to hardest
barplot(table(total_500_final$Readability), col = "dark red")

# Replace each code by its rank (1 = "01_VE" ... 7 = "07_VC") and make the column numeric
for (rank in seq_along(readability_codes)) {
  total_500_final$Readability <- gsub(
    readability_codes[[rank]], as.character(rank), total_500_final$Readability
  )
}
total_500_final$Readability <- as.numeric(total_500_final$Readability)
# geom_bar() has no `binwidth` parameter (ggplot2 warned and asked for geom_histogram()),
# so the histogram geom is called directly.
ggplot(data = total_500_final, mapping = aes(x = Readability)) +
  geom_histogram(binwidth = 1, colour = "darkred", fill = "red")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = Readability)) +
  geom_point(size = 3, colour = "purple")

# Correlations between Revenues and the two readability variables.
# Columns are selected by name (not by position) so the subset survives column reordering.
readability_vars <- c("Revenues", "Flesh_Mesaure", "Readability")
total_500_r <- total_500_final[, readability_vars]
library(corrplot)
library(caret)
tl <- cor(total_500_r)
tl
## Revenues Flesh_Mesaure Readability
## Revenues 1.00000000 0.02476229 -0.02694931
## Flesh_Mesaure 0.02476229 1.00000000 -0.17094994
## Readability -0.02694931 -0.17094994 1.00000000
# Reuse the matrix computed above instead of calling cor() a second time
corrplot(tl, method = "number")

#########################################################################################################
# Number of errors and number of warnings: each on its own and against the Revenues
ggplot(data = total_500_final, mapping = aes(x = number_of_errors)) +
  geom_histogram(binwidth = 50, colour = "red")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = number_of_errors)) +
  geom_point(size = 3, colour = "dark red")

ggplot(data = total_500_final, mapping = aes(x = number_of_warning)) +
  geom_histogram(binwidth = 20, colour = "red")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = number_of_warning)) +
  geom_point(size = 3, colour = "dark blue")

#########################################################################################################
#########################################################################################################
# The non.document.error and The_page_opened variables: each on its own and
# against the Revenues
ggplot(data = total_500_final, mapping = aes(x = non.document.error)) +
  geom_histogram(binwidth = 1, colour = "red")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = non.document.error)) +
  geom_point(size = 1, colour = "dark red")

ggplot(data = total_500_final, mapping = aes(x = The_page_opened)) +
  geom_histogram(binwidth = 1, colour = "red")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = The_page_opened)) +
  geom_point(size = 3, colour = "dark blue")

# The_page_opened only ever takes the value 1 (the page opened), so there is no point
# in using it in the analysis: it cannot affect the outcome.
#########################################################################################################
# Correlations between Revenues and the HTML validation variables.
# Columns are selected by name (not by position) so the subset survives column reordering.
html_vars <- c("Revenues", "non.document.error", "number_of_errors", "number_of_warning")
total_500_html <- total_500_final[, html_vars]
library(corrplot)
library(caret)
tl <- cor(total_500_html)
tl
## Revenues non.document.error number_of_errors
## Revenues 1.00000000 -0.0748407 0.0800205
## non.document.error -0.07484070 1.0000000 -0.2545301
## number_of_errors 0.08002050 -0.2545301 1.0000000
## number_of_warning 0.09505013 -0.2242315 0.2309578
## number_of_warning
## Revenues 0.09505013
## non.document.error -0.22423152
## number_of_errors 0.23095778
## number_of_warning 1.00000000
# Reuse the matrix computed above instead of calling cor() a second time
corrplot(tl, method = "number")

# Total number of images per site: its distribution, and its relationship with Revenues
ggplot(data = total_500_final, mapping = aes(x = total.images)) +
  geom_histogram(binwidth = 100, colour = "darkred", fill = "red")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = total.images)) +
  geom_point(size = 3, colour = "dark blue")

#########################################################################################################
# Frequency of the image types in use: one bar plot per image-type column, showing
# the share of sites for each observed count of that type.
par(mfrow = c(1, 1))
# Columns are addressed by name and the share is taken over the actual row count,
# instead of relying on positions 717:725 and a hard-coded 408.
image_type_cols <- c(
  ".bmp", ".dib", ".gif", ".jpe", ".jpeg", ".jpg", ".png", ".tif", ".tiff"
)
n_sites <- nrow(total_500_final)
for (type_col in image_type_cols) {
  image_type <- round(table(total_500_final[[type_col]]) / n_sites, 3)
  barplot(
    image_type,
    xlab = type_col,
    ylab = "Shares of images per site",
    col = "dark green"
  )
}









# The most common image types are clearly .jpg, .gif and .png.
# Next, each image-type count is plotted against the Revenues.
ggplot(data = total_500_final, mapping = aes(x = Revenues, y = .bmp)) +
  geom_point(size = 3, colour = "dark blue")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = .dib)) +
  geom_point(size = 3, colour = "dark blue")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = .gif)) +
  geom_point(size = 3, colour = "dark blue")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = .jpe)) +
  geom_point(size = 3, colour = "dark blue")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = .jpeg)) +
  geom_point(size = 3, colour = "dark blue")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = .jpg)) +
  geom_point(size = 3, colour = "dark blue")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = .png)) +
  geom_point(size = 3, colour = "dark blue")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = .tif)) +
  geom_point(size = 3, colour = "dark blue")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = .tiff)) +
  geom_point(size = 3, colour = "dark blue")

# Correlations between Revenues, the image-type counts and the total image count.
# Columns are selected by name (not by position) so the subset survives column reordering.
image_vars <- c(
  "Revenues", ".bmp", ".dib", ".gif", ".jpe", ".jpeg", ".jpg", ".png", ".tif", ".tiff",
  "total.images"
)
total_500_im <- total_500_final[, image_vars]
library(corrplot)
library(caret)
tl <- cor(total_500_im)
tl
## Revenues .bmp .dib .gif
## Revenues 1.000000000 0.083489281 0.0877047067 -0.020930575
## .bmp 0.083489281 1.000000000 -0.0013011275 -0.005780172
## .dib 0.087704707 -0.001301127 1.0000000000 0.196433219
## .gif -0.020930575 -0.005780172 0.1964332192 1.000000000
## .jpe 0.059660288 -0.003534780 0.9108504455 0.235836802
## .jpeg 0.059427022 -0.003482535 0.9108963639 0.236016392
## .jpg 0.004515870 0.013995542 0.0081915788 -0.006939606
## .png 0.022053238 -0.001301001 0.2204575537 0.164538688
## .tif -0.002466003 0.050383371 0.0628334455 0.031167772
## .tiff -0.030165817 -0.005659409 -0.0008571371 0.007252077
## total.images 0.051605339 0.018395189 0.7961253175 0.319131935
## .jpe .jpeg .jpg .png
## Revenues 0.059660288 0.059427022 0.004515870 0.022053238
## .bmp -0.003534780 -0.003482535 0.013995542 -0.001301001
## .dib 0.910850445 0.910896364 0.008191579 0.220457554
## .gif 0.235836802 0.236016392 -0.006939606 0.164538688
## .jpe 1.000000000 0.999991326 -0.008220505 0.231367993
## .jpeg 0.999991326 1.000000000 -0.008242305 0.231422560
## .jpg -0.008220505 -0.008242305 1.000000000 0.244033499
## .png 0.231367993 0.231422560 0.244033499 1.000000000
## .tif 0.007431086 0.007587510 0.375321187 0.259001712
## .tiff -0.005630343 -0.005548159 0.224429140 0.040635288
## total.images 0.855175392 0.855225420 0.413319367 0.529706228
## .tif .tiff total.images
## Revenues -0.002466003 -0.0301658169 0.05160534
## .bmp 0.050383371 -0.0056594093 0.01839519
## .dib 0.062833445 -0.0008571371 0.79612532
## .gif 0.031167772 0.0072520772 0.31913194
## .jpe 0.007431086 -0.0056303426 0.85517539
## .jpeg 0.007587510 -0.0055481589 0.85522542
## .jpg 0.375321187 0.2244291400 0.41331937
## .png 0.259001712 0.0406352880 0.52970623
## .tif 1.000000000 0.0222123897 0.34887215
## .tiff 0.022212390 1.0000000000 0.07827891
## total.images 0.348872154 0.0782789113 1.00000000
# Reuse the matrix computed above instead of calling cor() a second time
corrplot(tl, method = "number")

# Image sizes (columns 24 to 716): first find the size columns that take one single
# value for every site, i.e. whose first tabulated level accounts for all rows.
# The row count comes from the data instead of a hard-coded 408, and the result is
# built in one vectorised pass instead of growing `k` inside a loop.
size_cols <- 24:716
n_sites <- nrow(total_500_final)
is_constant <- vapply(
  total_500_final[size_cols],
  function(size_col) table(size_col)[[1]] == n_sites,
  logical(1)
)
# `k` holds the positions of the constant columns (empty when there is none)
k <- size_cols[is_constant]
#####################
# Column 24 has a single value throughout, so we will not use it
names(total_500_final)[24]
## [1] "X144x144"
total_500_final$X144x144 <- NULL
# Size columns (24 to 715) whose second tabulated level occurs in fewer than half
# of the sites. The threshold is half the actual row count instead of a hard-coded 204.
# NOTE(review): table(...)[[2]] assumes every column has two observed levels -- confirm.
size_cols <- 24:715
half_sites <- nrow(total_500_final) / 2
second_level_counts <- vapply(
  total_500_final[size_cols],
  function(size_col) as.numeric(table(size_col)[[2]]),
  numeric(1)
)
false_not_existing <- size_cols[second_level_counts < half_sites]
########################
# For each of those sizes, plot the indicator against the Revenues and show how
# often each level occurs across the sites.
par(mfrow = c(3, 3))
# Iterate over the columns actually found instead of a hard-coded 1:416
for (a in false_not_existing) {
  plot(total_500_final[, a], total_500_final$Revenues)
  image_size <- table(total_500_final[, a])
  barplot(
    image_size,
    xlab = names(total_500_final)[a],
    ylab = "Has or not the size",
    col = "dark green"
  )
}





























































































# Size columns (24 to 715) whose second tabulated level occurs in more than half
# of the sites. The threshold is half the actual row count instead of a hard-coded 204.
# NOTE(review): table(...)[[2]] assumes every column has two observed levels -- confirm.
size_cols <- 24:715
half_sites <- nrow(total_500_final) / 2
second_level_counts <- vapply(
  total_500_final[size_cols],
  function(size_col) as.numeric(table(size_col)[[2]]),
  numeric(1)
)
true_existing <- size_cols[second_level_counts > half_sites]
# For each of those sizes, plot the indicator against the Revenues and show how
# often each level occurs across the sites.
par(mfrow = c(3, 3))
# Iterate over the columns actually found instead of a hard-coded 1:276
for (a in true_existing) {
  image_size <- table(total_500_final[, a])
  plot(total_500_final[, a], total_500_final$Revenues)
  barplot(
    image_size,
    xlab = names(total_500_final)[a],
    ylab = "Has or not the size",
    col = "dark green"
  )
}






























































# From the plots above, the first 24 sizes appear to show some differentiation with
# respect to the revenues: most sites have those sizes, but the sites with high
# revenues do not.
par(mfrow = c(3, 3))
# Keep the first 24 entries of true_existing (fewer if the vector is shorter)
keep <- head(true_existing, 24)
keep
## [1] 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## [24] 47
# These are columns 24 to 47, the only sizes kept for the further analysis.
# Every other size column in 24:715 is dropped; the set is derived from `keep`
# instead of hard-coding 48:715, and the logical mask stays correct even when
# there is nothing to drop.
dropped_size_cols <- setdiff(24:715, keep)
total_500_final <- total_500_final[
  , !(seq_along(total_500_final) %in% dropped_size_cols),
  drop = FALSE
]
# The remaining Fortune 500 variables would interfere with the outcome of the model,
# so they are removed together with The_page_opened; Revenues is the only Fortune
# variable kept, as it is the one we want to examine.
unused_columns <- c(
  "Market_Value", "Assets", "Ranking", "Total_SH_Equity", "The_page_opened"
)
total_500_final <- total_500_final[
  , !(names(total_500_final) %in% unused_columns),
  drop = FALSE
]
summary(total_500_final)
## Revenues non.document.error number_of_errors number_of_warning
## Min. : 5.130 Min. :0.0000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 7.047 1st Qu.:0.0000 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 11.118 Median :0.0000 Median : 13.00 Median : 3.000
## Mean : 22.244 Mean :0.2451 Mean : 37.36 Mean : 8.669
## 3rd Qu.: 20.858 3rd Qu.:0.0000 3rd Qu.: 37.00 3rd Qu.: 9.000
## Max. :233.715 Max. :1.0000 Max. :995.00 Max. :214.000
## facebook instagram linkedin pinterest
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :1.0000 Median :0.000 Median :1.0000 Median :0.00000
## Mean :0.6471 Mean :0.223 Mean :0.5711 Mean :0.09804
## 3rd Qu.:1.0000 3rd Qu.:0.000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.000 Max. :1.0000 Max. :1.00000
## twitter youtube Flesh_Mesaure Readability
## Min. :0.0000 Min. :0.0000 Min. :-3422.40 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 34.85 1st Qu.:5.000
## Median :1.0000 Median :1.0000 Median : 45.55 Median :6.000
## Mean :0.6863 Mean :0.5833 Mean : 35.37 Mean :5.517
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 55.40 3rd Qu.:6.000
## Max. :1.0000 Max. :1.0000 Max. : 121.20 Max. :7.000
## Sentences Unique.words Words external
## Min. : 1.0 Min. : 0.0 Min. : 1.0 Min. : 0.00
## 1st Qu.: 73.5 1st Qu.: 59.0 1st Qu.: 287.0 1st Qu.: 2.00
## Median : 139.5 Median : 113.0 Median : 504.5 Median : 5.00
## Mean : 178.9 Mean : 152.3 Mean : 692.1 Mean : 18.00
## 3rd Qu.: 243.5 3rd Qu.: 191.5 3rd Qu.: 905.5 3rd Qu.: 13.25
## Max. :1350.0 Max. :1910.0 Max. :8306.0 Max. :545.00
## internal total.links X15x75 X8x15
## Min. : 0.0 Min. : 0.0 Min. :1.000 Min. :1.000
## 1st Qu.: 73.0 1st Qu.: 81.0 1st Qu.:2.000 1st Qu.:2.000
## Median : 117.0 Median : 134.5 Median :2.000 Median :2.000
## Mean : 154.9 Mean : 172.9 Mean :1.998 Mean :1.995
## 3rd Qu.: 183.0 3rd Qu.: 212.2 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :1254.0 Max. :1255.0 Max. :2.000 Max. :2.000
## X44x556 X1x1 X800x1200 autox100.
## Min. :1.000 Min. :1.000 Min. :1.00 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.00 1st Qu.:2.000
## Median :2.000 Median :2.000 Median :2.00 Median :2.000
## Mean :1.993 Mean :1.993 Mean :1.99 Mean :1.985
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.00 3rd Qu.:2.000
## Max. :2.000 Max. :2.000 Max. :2.00 Max. :2.000
## X24pxx133px X21pxx173px X46x214 X49x49
## Min. :1.000 Min. :1.000 Min. :1.00 Min. :1.00
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.00 1st Qu.:2.00
## Median :2.000 Median :2.000 Median :2.00 Median :2.00
## Mean :1.983 Mean :1.983 Mean :1.98 Mean :1.98
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.00 3rd Qu.:2.00
## Max. :2.000 Max. :2.000 Max. :2.00 Max. :2.00
## X50x45 X400x300 X292pxx292px X200pxx200px
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :2.000 Median :2.000 Median :2.000
## Mean :1.975 Mean :1.973 Mean :1.968 Mean :1.968
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :2.000 Max. :2.000 Max. :2.000 Max. :2.000
## X1279pxx984px X300pxx1500px X29x29 X115x223
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :2.000 Median :2.000 Median :2.000
## Mean :1.968 Mean :1.968 Mean :1.961 Mean :1.951
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :2.000 Max. :2.000 Max. :2.000 Max. :2.000
## X160x233 X300x993 X41x192 X28x221
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :2.000 Median :2.000 Median :2.000
## Mean :1.951 Mean :1.951 Mean :1.951 Mean :1.951
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :2.000 Max. :2.000 Max. :2.000 Max. :2.000
## X15x12 X60x60 .bmp .dib
## Min. :1.000 Min. :1.000 Min. : 0.00000 Min. : 0.0000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 0.00000 1st Qu.: 0.0000
## Median :2.000 Median :2.000 Median : 0.00000 Median : 0.0000
## Mean :1.949 Mean :1.946 Mean : 0.06863 Mean : 0.1838
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.: 0.00000 3rd Qu.: 0.0000
## Max. :2.000 Max. :2.000 Max. :23.00000 Max. :35.0000
## .gif .jpe .jpeg .jpg
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 2.00
## Median : 1.000 Median : 0.000 Median : 0.000 Median : 8.50
## Mean : 4.081 Mean : 2.863 Mean : 2.821 Mean : 18.09
## 3rd Qu.: 3.000 3rd Qu.: 0.000 3rd Qu.: 0.000 3rd Qu.: 15.25
## Max. :143.000 Max. :968.000 Max. :968.000 Max. :363.00
## .png .tif .tiff total.images
## Min. : 0.00 Min. : 0.000 Min. :0.00000 Min. : 0.00
## 1st Qu.: 3.00 1st Qu.: 0.000 1st Qu.:0.00000 1st Qu.: 13.00
## Median : 8.00 Median : 0.000 Median :0.00000 Median : 24.00
## Mean : 15.51 Mean : 4.211 Mean :0.01471 Mean : 47.84
## 3rd Qu.: 18.00 3rd Qu.: 3.000 3rd Qu.:0.00000 3rd Qu.: 43.25
## Max. :304.00 Max. :301.000 Max. :2.00000 Max. :2162.00
## loading.time
## Min. :0.00000
## 1st Qu.:0.08125
## Median :0.27400
## Mean :0.36143
## 3rd Qu.:0.50850
## Max. :4.06800
names(total_500_final)
## [1] "Revenues" "non.document.error" "number_of_errors"
## [4] "number_of_warning" "facebook" "instagram"
## [7] "linkedin" "pinterest" "twitter"
## [10] "youtube" "Flesh_Mesaure" "Readability"
## [13] "Sentences" "Unique.words" "Words"
## [16] "external" "internal" "total.links"
## [19] "X15x75" "X8x15" "X44x556"
## [22] "X1x1" "X800x1200" "autox100."
## [25] "X24pxx133px" "X21pxx173px" "X46x214"
## [28] "X49x49" "X50x45" "X400x300"
## [31] "X292pxx292px" "X200pxx200px" "X1279pxx984px"
## [34] "X300pxx1500px" "X29x29" "X115x223"
## [37] "X160x233" "X300x993" "X41x192"
## [40] "X28x221" "X15x12" "X60x60"
## [43] ".bmp" ".dib" ".gif"
## [46] ".jpe" ".jpeg" ".jpg"
## [49] ".png" ".tif" ".tiff"
## [52] "total.images" "loading.time"
# Recode one size indicator from the 1/2 coding to 0/1: every "1" becomes "0",
# then every "2" becomes "1", and the result is converted back to numeric.
# The two substitutions run in the same order as the original per-column statements.
recode_presence <- function(indicator) {
  as.numeric(gsub("2", "1", gsub("1", "0", indicator)))
}
# Columns 19 to 42 are the 24 image-size indicators listed above; one helper call
# replaces the 48 copy-pasted gsub() statements and the final as.numeric() loop.
size_indicator_cols <- 19:42
total_500_final[size_indicator_cols] <- lapply(
  total_500_final[size_indicator_cols],
  recode_presence
)
# Column 24 ("autox100.") is renamed
colnames(total_500_final)[24] <- "X100x100"
#We split the set to training and test set
library(caret)
set.seed(20)
sampling_vector <- createDataPartition(total_500_final$Revenues, p = 0.70, list = FALSE)
total_500_final_train <- total_500_final[sampling_vector,]
total_500_final_test <- total_500_final[-sampling_vector,]
# We build regression models to see which website variables play the most
# important part for the Revenues of the company (Revenues is the response
# of the models fitted here).
# Start with the empty (intercept-only) lm model.
model_null <- lm(formula = Revenues ~ 1, data = total_500_final_train)
summary(model_null)
##
## Call:
## lm(formula = Revenues ~ 1, data = total_500_final_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.070 -16.119 -12.152 -2.342 210.515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.200 2.023 11.47 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.33 on 287 degrees of freedom
#####################################################################################################
#LASSO and Logistic Regression models
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-2
# Full model: Revenues regressed on every other column of the training set
full <- lm(formula = Revenues ~ ., data = total_500_final_train)
summary(full)
##
## Call:
## lm(formula = Revenues ~ ., data = total_500_final_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.227 -7.486 -3.338 1.303 66.316
##
## Coefficients: (13 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.266e+02 1.618e+01 14.004 < 2e-16 ***
## non.document.error 5.657e-01 2.195e+00 0.258 0.796868
## number_of_errors 1.230e-02 1.113e-02 1.105 0.270122
## number_of_warning 4.159e-03 4.276e-02 0.097 0.922592
## facebook -4.503e+00 2.796e+00 -1.611 0.108480
## instagram 1.358e+00 2.551e+00 0.532 0.595127
## linkedin -3.492e-01 2.435e+00 -0.143 0.886095
## pinterest -1.613e+00 3.598e+00 -0.448 0.654262
## twitter 2.911e+00 2.727e+00 1.068 0.286712
## youtube 5.036e+00 2.221e+00 2.267 0.024250 *
## Flesh_Mesaure -3.099e-02 3.103e-02 -0.999 0.318996
## Readability 6.973e-01 9.704e-01 0.719 0.473101
## Sentences 2.104e-03 1.216e-02 0.173 0.862755
## Unique.words -2.273e-02 2.089e-02 -1.088 0.277518
## Words 5.093e-03 5.053e-03 1.008 0.314457
## external -6.190e-04 1.927e-02 -0.032 0.974397
## internal -8.858e-04 1.083e-02 -0.082 0.934908
## total.links NA NA NA NA
## X15x75 -1.762e+01 2.070e+01 -0.851 0.395495
## X8x15 -3.169e+01 2.095e+01 -1.513 0.131677
## X44x556 -1.722e+01 2.104e+01 -0.819 0.413786
## X1x1 NA NA NA NA
## X800x1200 -1.293e+01 1.815e+01 -0.712 0.477033
## X100x100 -3.007e+01 5.554e+01 -0.541 0.588661
## X24pxx133px 2.467e+01 5.588e+01 0.441 0.659272
## X21pxx173px NA NA NA NA
## X46x214 -9.818e+00 1.819e+01 -0.540 0.589877
## X49x49 NA NA NA NA
## X50x45 -7.650e+00 1.819e+01 -0.421 0.674433
## X400x300 -2.693e+00 2.133e+01 -0.126 0.899623
## X292pxx292px -2.495e+01 1.887e+01 -1.322 0.187254
## X200pxx200px NA NA NA NA
## X1279pxx984px NA NA NA NA
## X300pxx1500px NA NA NA NA
## X29x29 -1.477e+00 1.823e+01 -0.081 0.935472
## X115x223 -1.150e+01 2.099e+01 -0.548 0.584217
## X160x233 NA NA NA NA
## X300x993 NA NA NA NA
## X41x192 NA NA NA NA
## X28x221 NA NA NA NA
## X15x12 -7.116e+01 1.501e+01 -4.741 3.59e-06 ***
## X60x60 NA NA NA NA
## .bmp 2.396e+00 6.357e-01 3.770 0.000204 ***
## .dib 1.431e+00 1.315e+00 1.088 0.277552
## .gif -1.133e-01 8.477e-02 -1.337 0.182535
## .jpe 1.133e-01 4.324e+00 0.026 0.979123
## .jpeg 3.710e-01 4.367e+00 0.085 0.932364
## .jpg 4.584e-03 3.071e-02 0.149 0.881438
## .png -1.891e-02 4.015e-02 -0.471 0.637976
## .tif 3.002e-03 4.911e-02 0.061 0.951302
## .tiff -1.127e+00 7.240e+00 -0.156 0.876394
## total.images NA NA NA NA
## loading.time -5.314e+00 2.578e+00 -2.061 0.040332 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.38 on 248 degrees of freedom
## Multiple R-squared: 0.8484, Adjusted R-squared: 0.8246
## F-statistic: 35.59 on 39 and 248 DF, p-value: < 2.2e-16
# Design matrix of the full model without its intercept column
x <- model.matrix(full)[, -1]
dim(x)
## [1] 288 52
# Lasso coefficient path for Revenues
lasso <- glmnet(x, total_500_final_train$Revenues)
# Single-panel layout; no.readonly is only meaningful when querying par(),
# so it is not passed here
par(mfrow = c(1, 1))
plot(lasso, xvar = "lambda", label = TRUE)

# Cross-validated lasso to choose lambda (the folds are drawn at random, so
# the values below depend on the RNG state at this point)
lassob <- cv.glmnet(x, total_500_final_train$Revenues)
lassob$lambda.min
## [1] 0.4511413
lassob$lambda.1se
## [1] 2.642344
plot(lassob)

# Coefficients at lambda.min
blasso <- coef(lassob, s = "lambda.min")
blasso
## 53 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 2.230833e+02
## non.document.error .
## number_of_errors 6.149259e-03
## number_of_warning .
## facebook -8.053246e-01
## instagram .
## linkedin .
## pinterest .
## twitter 9.824382e-02
## youtube 3.711612e+00
## Flesh_Mesaure -1.720979e-02
## Readability .
## Sentences 2.566489e-03
## Unique.words .
## Words .
## external .
## internal .
## total.links .
## X15x75 -1.448135e+01
## X8x15 -3.095291e+01
## X44x556 -1.873621e+01
## X1x1 -5.102733e-11
## X800x1200 -1.075697e+01
## X100x100 -1.877745e+00
## X24pxx133px -2.682910e+00
## X21pxx173px .
## X46x214 -9.778805e+00
## X49x49 .
## X50x45 -7.787668e+00
## X400x300 -9.798054e+00
## X292pxx292px -1.209080e+01
## X200pxx200px -7.655717e-01
## X1279pxx984px -2.020626e-03
## X300pxx1500px -2.665792e-12
## X29x29 -4.932214e+00
## X115x223 -1.326506e+01
## X160x233 -2.505006e-10
## X300x993 -3.322224e-14
## X41x192 -9.966671e-14
## X28x221 -1.162778e-13
## X15x12 -6.972276e+01
## X60x60 -7.618330e-11
## .bmp 2.148840e+00
## .dib 3.222502e-01
## .gif -7.060778e-02
## .jpe .
## .jpeg .
## .jpg 1.022970e-02
## .png .
## .tif .
## .tiff .
## total.images .
## loading.time -3.897970e+00
dim(blasso)
## [1] 53 1
# Scale the lasso and OLS coefficients by the standard deviation of each
# predictor column so their absolute sizes can be summed and compared
predictor_sd <- apply(x, 2, sd)
zblasso <- blasso[-1] * predictor_sd
zbolt <- coef(full)[-1] * predictor_sd
azbolt <- abs(zbolt)
sum(azbolt)
## [1] NA
# The sum is NA, which means some variables have to be removed.
# To find out which ones, print the coefficients and look for those that
# come out as NA.
coef(full)
## (Intercept) non.document.error number_of_errors
## 2.265809e+02 5.656880e-01 1.230253e-02
## number_of_warning facebook instagram
## 4.158870e-03 -4.503280e+00 1.357670e+00
## linkedin pinterest twitter
## -3.491824e-01 -1.613471e+00 2.911065e+00
## youtube Flesh_Mesaure Readability
## 5.036021e+00 -3.098642e-02 6.972808e-01
## Sentences Unique.words Words
## 2.104257e-03 -2.273419e-02 5.093208e-03
## external internal total.links
## -6.190402e-04 -8.857651e-04 NA
## X15x75 X8x15 X44x556
## -1.761698e+01 -3.169113e+01 -1.722275e+01
## X1x1 X800x1200 X100x100
## NA -1.292930e+01 -3.007324e+01
## X24pxx133px X21pxx173px X46x214
## 2.466914e+01 NA -9.817905e+00
## X49x49 X50x45 X400x300
## NA -7.650074e+00 -2.692975e+00
## X292pxx292px X200pxx200px X1279pxx984px
## -2.494753e+01 NA NA
## X300pxx1500px X29x29 X115x223
## NA -1.477068e+00 -1.150271e+01
## X160x233 X300x993 X41x192
## NA NA NA
## X28x221 X15x12 X60x60
## NA -7.115613e+01 NA
## .bmp .dib .gif
## 2.396454e+00 1.430784e+00 -1.133168e-01
## .jpe .jpeg .jpg
## 1.132600e-01 3.710030e-01 4.584279e-03
## .png .tif .tiff
## -1.891363e-02 3.002462e-03 -1.127253e+00
## total.images loading.time
## NA -5.314492e+00
# Refit the model keeping only the variables whose coefficient was not NA
full_2 <- lm(
  Revenues ~ . - total.images - total.links - X1x1 - X21pxx173px - X49x49 -
    X200pxx200px - X1279pxx984px - X300pxx1500px - X160x233 - X300x993 -
    X41x192 - X28x221 - X60x60,
  data = total_500_final_train
)
summary(full_2)
##
## Call:
## lm(formula = Revenues ~ . - total.images - total.links - X1x1 -
## X21pxx173px - X49x49 - X200pxx200px - X1279pxx984px - X300pxx1500px -
## X160x233 - X300x993 - X41x192 - X28x221 - X60x60, data = total_500_final_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.227 -7.486 -3.338 1.303 66.316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.266e+02 1.618e+01 14.004 < 2e-16 ***
## non.document.error 5.657e-01 2.195e+00 0.258 0.796868
## number_of_errors 1.230e-02 1.113e-02 1.105 0.270122
## number_of_warning 4.159e-03 4.276e-02 0.097 0.922592
## facebook -4.503e+00 2.796e+00 -1.611 0.108480
## instagram 1.358e+00 2.551e+00 0.532 0.595127
## linkedin -3.492e-01 2.435e+00 -0.143 0.886095
## pinterest -1.613e+00 3.598e+00 -0.448 0.654262
## twitter 2.911e+00 2.727e+00 1.068 0.286712
## youtube 5.036e+00 2.221e+00 2.267 0.024250 *
## Flesh_Mesaure -3.099e-02 3.103e-02 -0.999 0.318996
## Readability 6.973e-01 9.704e-01 0.719 0.473101
## Sentences 2.104e-03 1.216e-02 0.173 0.862755
## Unique.words -2.273e-02 2.089e-02 -1.088 0.277518
## Words 5.093e-03 5.053e-03 1.008 0.314457
## external -6.190e-04 1.927e-02 -0.032 0.974397
## internal -8.858e-04 1.083e-02 -0.082 0.934908
## X15x75 -1.762e+01 2.070e+01 -0.851 0.395495
## X8x15 -3.169e+01 2.095e+01 -1.513 0.131677
## X44x556 -1.722e+01 2.104e+01 -0.819 0.413786
## X800x1200 -1.293e+01 1.815e+01 -0.712 0.477033
## X100x100 -3.007e+01 5.554e+01 -0.541 0.588661
## X24pxx133px 2.467e+01 5.588e+01 0.441 0.659272
## X46x214 -9.818e+00 1.819e+01 -0.540 0.589877
## X50x45 -7.650e+00 1.819e+01 -0.421 0.674433
## X400x300 -2.693e+00 2.133e+01 -0.126 0.899623
## X292pxx292px -2.495e+01 1.887e+01 -1.322 0.187254
## X29x29 -1.477e+00 1.823e+01 -0.081 0.935472
## X115x223 -1.150e+01 2.099e+01 -0.548 0.584217
## X15x12 -7.116e+01 1.501e+01 -4.741 3.59e-06 ***
## .bmp 2.396e+00 6.357e-01 3.770 0.000204 ***
## .dib 1.431e+00 1.315e+00 1.088 0.277552
## .gif -1.133e-01 8.477e-02 -1.337 0.182535
## .jpe 1.133e-01 4.324e+00 0.026 0.979123
## .jpeg 3.710e-01 4.367e+00 0.085 0.932364
## .jpg 4.584e-03 3.071e-02 0.149 0.881438
## .png -1.891e-02 4.015e-02 -0.471 0.637976
## .tif 3.002e-03 4.911e-02 0.061 0.951302
## .tiff -1.127e+00 7.240e+00 -0.156 0.876394
## loading.time -5.314e+00 2.578e+00 -2.061 0.040332 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.38 on 248 degrees of freedom
## Multiple R-squared: 0.8484, Adjusted R-squared: 0.8246
## F-statistic: 35.59 on 39 and 248 DF, p-value: < 2.2e-16
# Design matrix of full_2 without its intercept column.
# full_2 already leaves the NA-coefficient predictors out of its formula, so
# only the intercept has to be dropped. The previous positional drop
# (-c(18, 22, 28, ...)) used column positions that belong to
# model.matrix(full); applied to full_2's 40-column matrix it kept the
# "(Intercept)" column and removed valid predictors such as X15x75, X400x300
# and loading.time.
# NOTE: the recorded outputs further down (lasso coefficients, lambdas,
# full_3 / full_4 variable choices) were produced with the old 29-column
# matrix and need to be regenerated.
x <- model.matrix(full_2)[, -1]
dim(x)
## [1] 288 39
# Lasso coefficient path on the reduced design matrix
lasso <- glmnet(x, total_500_final_train$Revenues)
plot(lasso, xvar = "lambda", label = TRUE)

# Cross-validated lasso to choose lambda (the folds are drawn at random, so
# the values below depend on the RNG state at this point)
lassob <- cv.glmnet(x, total_500_final_train$Revenues)
lassob$lambda.min
## [1] 0.495127
lassob$lambda.1se
## [1] 3.833588
plot(lassob)

# Coefficients at lambda.min
blasso <- coef(lassob, s = "lambda.min")
blasso
## 30 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 215.050596035
## (Intercept) .
## non.document.error .
## number_of_errors 0.004545402
## number_of_warning .
## facebook -0.465821955
## instagram .
## linkedin .
## pinterest -0.442067577
## twitter .
## youtube 3.714394348
## Flesh_Mesaure -0.012455651
## Readability .
## Sentences 0.001245264
## Unique.words .
## Words .
## external .
## internal .
## X8x15 -39.910397889
## X44x556 -19.570357469
## X800x1200 -9.091373726
## X24pxx133px -3.964585608
## X46x214 -8.939435748
## X50x45 -12.779830022
## X292pxx292px -19.158990860
## X115x223 -17.248924356
## X15x12 -70.512070793
## .bmp 2.152463769
## .jpeg .
## .jpg 0.008247367
dim(blasso)
## [1] 30 1
# Standardise each coefficient vector by the standard deviation of its own
# predictors. The OLS coefficients of full_2 are paired with the columns of
# full_2's own design matrix: in the recorded run they were multiplied by
# the sds of x, which had fewer columns than full_2 has coefficients, so R
# recycled the shorter vector ("longer object length is not a multiple of
# shorter object length") and the products were misaligned.
zblasso <- blasso[-1] * apply(x, 2, sd)
ols_design <- model.matrix(full_2)[, -1]
zbolt <- coef(full_2)[-1] * apply(ols_design, 2, sd)
azbolt <- abs(zbolt)
sum(azbolt)
## recorded before the alignment fix: [1] 5546.237 (re-run to refresh)
# Ratio of the total absolute standardised lasso coefficients to the OLS ones
s <- sum(abs(zblasso)) / sum(azbolt)
s
## recorded before the alignment fix: [1] 0.007532779 (re-run to refresh)
# OLS refit on the predictors listed with a non-zero lasso coefficient at
# lambda.min
full_3 <- lm(
  Revenues ~ 1 + number_of_errors + facebook + pinterest + youtube +
    Flesh_Mesaure + Sentences + X8x15 + X44x556 + X800x1200 + X24pxx133px +
    X46x214 + X50x45 + X292pxx292px + X115x223 + X15x12 + .bmp + .jpg,
  data = total_500_final_train
)
summary(full_3)
##
## Call:
## lm(formula = Revenues ~ 1 + number_of_errors + facebook + pinterest +
## youtube + Flesh_Mesaure + Sentences + X8x15 + X44x556 + X800x1200 +
## X24pxx133px + X46x214 + X50x45 + X292pxx292px + X115x223 +
## X15x12 + .bmp + .jpg, data = total_500_final_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.444 -7.482 -3.728 1.968 67.468
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 220.708931 10.142389 21.761 < 2e-16 ***
## number_of_errors 0.009297 0.009860 0.943 0.346575
## facebook -2.590609 2.142461 -1.209 0.227654
## pinterest -1.966700 3.155150 -0.623 0.533594
## youtube 5.721687 2.009439 2.847 0.004746 **
## Flesh_Mesaure -0.028894 0.024264 -1.191 0.234779
## Sentences 0.003712 0.005628 0.660 0.510092
## X8x15 -43.592759 17.439657 -2.500 0.013026 *
## X44x556 -16.356982 20.137030 -0.812 0.417344
## X800x1200 -12.902443 16.481121 -0.783 0.434394
## X24pxx133px -2.075224 16.689391 -0.124 0.901135
## X46x214 -9.506257 17.481911 -0.544 0.587044
## X50x45 -13.137735 14.219494 -0.924 0.356351
## X292pxx292px -20.460731 12.440363 -1.645 0.101194
## X115x223 -15.298995 16.102120 -0.950 0.342899
## X15x12 -73.307867 14.451593 -5.073 7.3e-07 ***
## .bmp 2.435106 0.619522 3.931 0.000108 ***
## .jpg 0.015724 0.020551 0.765 0.444875
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.14 on 270 degrees of freedom
## Multiple R-squared: 0.8405, Adjusted R-squared: 0.8304
## F-statistic: 83.67 on 17 and 270 DF, p-value: < 2.2e-16
# Keep the adjusted R-squared and the AIC of full_3 for the comparison
# further down, then draw diagnostic plots 1 to 3 of the fit
ad_r_sq_f3 <- summary(full_3)[["adj.r.squared"]]
aic_f3 <- AIC(full_3)
plot(full_3, which = 1:3)
## Warning: not plotting observations with leverage one:
## 3, 4, 8, 17



##############################################
# Coefficients of the cross-validated lasso at lambda.1se
blassob <- coef(object = lassob, s = "lambda.1se")
blassob
## 30 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 176.417391
## (Intercept) .
## non.document.error .
## number_of_errors .
## number_of_warning .
## facebook .
## instagram .
## linkedin .
## pinterest .
## twitter .
## youtube .
## Flesh_Mesaure .
## Readability .
## Sentences .
## Unique.words .
## Words .
## external .
## internal .
## X8x15 -15.418145
## X44x556 -21.240420
## X800x1200 -1.553671
## X24pxx133px -2.926734
## X46x214 -9.772246
## X50x45 -10.987677
## X292pxx292px -18.334032
## X115x223 -15.592779
## X15x12 -63.865600
## .bmp .
## .jpeg .
## .jpg .
# Standardised comparison for the lambda.1se coefficients.
# The OLS coefficients of full_2 are multiplied by the sds of full_2's own
# design-matrix columns; in the recorded run they were multiplied by the sds
# of x, whose column count differed, so R recycled the shorter vector
# ("longer object length is not a multiple of shorter object length") and
# the products were misaligned.
zblassob <- blassob[-1] * apply(x, 2, sd)
zboltb <- coef(full_2)[-1] * apply(model.matrix(full_2)[, -1], 2, sd)
# Ratio of the total absolute standardised lasso coefficients to the OLS ones
s <- sum(abs(zblassob)) / sum(abs(zboltb))
s
## recorded before the alignment fix: [1] 0.005413772 (re-run to refresh)
# OLS refit on the predictors listed with a non-zero lasso coefficient at
# lambda.1se
full_4 <- lm(
  Revenues ~ 1 + X8x15 + X44x556 + X800x1200 + X24pxx133px + X46x214 +
    X50x45 + X292pxx292px + X115x223 + X15x12,
  data = total_500_final_train
)
summary(full_4)
##
## Call:
## lm(formula = Revenues ~ 1 + X8x15 + X44x556 + X800x1200 + X24pxx133px +
## X46x214 + X50x45 + X292pxx292px + X115x223 + X15x12, data = total_500_final_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.447 -8.735 -4.926 1.100 66.695
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 222.268 10.408 21.356 < 2e-16 ***
## X8x15 -41.027 18.027 -2.276 0.0236 *
## X44x556 -24.134 20.816 -1.159 0.2473
## X800x1200 -5.372 16.996 -0.316 0.7522
## X24pxx133px -4.934 16.996 -0.290 0.7718
## X46x214 -10.796 18.027 -0.599 0.5497
## X50x45 -12.346 14.719 -0.839 0.4023
## X292pxx292px -19.575 12.747 -1.536 0.1258
## X115x223 -15.565 16.456 -0.946 0.3450
## X15x12 -72.753 14.746 -4.934 1.39e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.72 on 278 degrees of freedom
## Multiple R-squared: 0.822, Adjusted R-squared: 0.8162
## F-statistic: 142.6 on 9 and 278 DF, p-value: < 2.2e-16
# Keep the adjusted R-squared and the AIC of full_4 for the comparison
# further down, then draw diagnostic plots 1 to 3 of the fit
ad_r_sq_f4 <- summary(full_4)[["adj.r.squared"]]
aic_f4 <- AIC(full_4)
plot(full_4, which = 1:3)
## Warning: not plotting observations with leverage one:
## 4, 8



###############################################
# Stepwise selection in both directions: start from the intercept-only model
# and search between it (lower bound) and full_2 (upper bound) to see how
# many variables are indeed important
model_a <- step(
  object = model_null,
  scope = list(lower = model_null, upper = full_2),
  direction = "both"
)
## Start: AIC=2037.77
## Revenues ~ 1
##
## Df Sum of Sq RSS AIC
## + X15x12 1 253735 84542 1640.4
## + X115x223 1 251468 86810 1648.0
## + X29x29 1 245941 92337 1665.8
## + X292pxx292px 1 229850 108428 1712.1
## + X400x300 1 222075 116203 1732.0
## + X50x45 1 210336 127942 1759.8
## + X46x214 1 184923 153355 1811.9
## + X24pxx133px 1 169418 168860 1839.7
## + X100x100 1 153400 184878 1865.8
## + X800x1200 1 120731 217547 1912.6
## + X44x556 1 104196 234082 1933.7
## + X8x15 1 79810 258468 1962.3
## + X15x75 1 44471 293807 1999.2
## + .jpe 1 14156 324122 2027.5
## + .jpeg 1 13535 324743 2028.0
## + .dib 1 7274 331004 2033.5
## + loading.time 1 5444 332834 2035.1
## + number_of_warning 1 4240 334037 2036.1
## + non.document.error 1 3575 334703 2036.7
## + youtube 1 3338 334940 2036.9
## + .bmp 1 2634 335644 2037.5
## <none> 338278 2037.8
## + number_of_errors 1 1708 336570 2038.3
## + pinterest 1 1505 336773 2038.5
## + internal 1 907 337371 2039.0
## + .gif 1 666 337612 2039.2
## + instagram 1 646 337632 2039.2
## + linkedin 1 570 337708 2039.3
## + Readability 1 563 337715 2039.3
## + Unique.words 1 552 337726 2039.3
## + Words 1 201 338077 2039.6
## + Sentences 1 189 338089 2039.6
## + external 1 131 338147 2039.7
## + .png 1 119 338159 2039.7
## + .tiff 1 107 338171 2039.7
## + twitter 1 106 338172 2039.7
## + Flesh_Mesaure 1 54 338224 2039.7
## + .tif 1 9 338269 2039.8
## + facebook 1 7 338271 2039.8
## + .jpg 1 4 338274 2039.8
##
## Step: AIC=1640.43
## Revenues ~ X15x12
##
## Df Sum of Sq RSS AIC
## + X44x556 1 16295 68247 1580.8
## + X800x1200 1 15259 69283 1585.1
## + X46x214 1 14900 69642 1586.6
## + X24pxx133px 1 14711 69832 1587.4
## + X8x15 1 14710 69833 1587.4
## + X100x100 1 14623 69919 1587.7
## + X50x45 1 13965 70577 1590.4
## + X400x300 1 13532 71011 1592.2
## + X292pxx292px 1 11756 72787 1599.3
## + X15x75 1 8994 75548 1610.0
## + X29x29 1 4835 79708 1625.5
## + .bmp 1 3641 80901 1629.8
## + X115x223 1 3006 81536 1632.0
## + youtube 1 1650 82892 1636.8
## + loading.time 1 908 83635 1639.3
## + .dib 1 710 83832 1640.0
## + .gif 1 599 83943 1640.4
## <none> 84542 1640.4
## + pinterest 1 513 84030 1640.7
## + Readability 1 400 84142 1641.1
## + instagram 1 372 84171 1641.2
## + linkedin 1 332 84211 1641.3
## + .jpg 1 257 84285 1641.5
## + Flesh_Mesaure 1 243 84299 1641.6
## + number_of_errors 1 219 84324 1641.7
## + number_of_warning 1 174 84368 1641.8
## + .jpe 1 158 84384 1641.9
## + .jpeg 1 152 84390 1641.9
## + external 1 87 84455 1642.1
## + facebook 1 76 84466 1642.2
## + non.document.error 1 41 84502 1642.3
## + .png 1 31 84511 1642.3
## + twitter 1 28 84514 1642.3
## + Sentences 1 18 84525 1642.4
## + Words 1 12 84530 1642.4
## + .tif 1 11 84531 1642.4
## + Unique.words 1 2 84541 1642.4
## + internal 1 1 84542 1642.4
## + .tiff 1 0 84542 1642.4
## - X15x12 1 253735 338278 2037.8
##
## Step: AIC=1580.76
## Revenues ~ X15x12 + X44x556
##
## Df Sum of Sq RSS AIC
## + X400x300 1 5982 62265 1556.3
## + X50x45 1 5558 62689 1558.3
## + X292pxx292px 1 5424 62823 1558.9
## + X46x214 1 4647 63600 1562.5
## + X24pxx133px 1 3699 64548 1566.7
## + .bmp 1 3641 64606 1567.0
## + X100x100 1 2757 65490 1570.9
## + X29x29 1 2367 65880 1572.6
## + X115x223 1 1626 66621 1575.8
## + youtube 1 1604 66643 1575.9
## + X8x15 1 1122 67125 1578.0
## + X800x1200 1 952 67295 1578.7
## + loading.time 1 950 67297 1578.7
## + X15x75 1 947 67300 1578.7
## + .jpe 1 730 67517 1579.7
## + .jpeg 1 691 67556 1579.8
## <none> 68247 1580.8
## + .gif 1 414 67833 1581.0
## + .jpg 1 394 67853 1581.1
## + number_of_errors 1 348 67899 1581.3
## + twitter 1 286 67961 1581.5
## + linkedin 1 254 67993 1581.7
## + instagram 1 252 67995 1581.7
## + external 1 189 68058 1582.0
## + Flesh_Mesaure 1 186 68061 1582.0
## + number_of_warning 1 168 68079 1582.0
## + Readability 1 162 68085 1582.1
## + .dib 1 144 68103 1582.2
## + pinterest 1 121 68126 1582.2
## + Sentences 1 99 68148 1582.3
## + Words 1 37 68210 1582.6
## + .tif 1 37 68210 1582.6
## + non.document.error 1 19 68228 1582.7
## + internal 1 10 68237 1582.7
## + .png 1 8 68239 1582.7
## + facebook 1 7 68240 1582.7
## + Unique.words 1 2 68245 1582.8
## + .tiff 1 0 68247 1582.8
## - X44x556 1 16295 84542 1640.4
## - X15x12 1 165835 234082 1933.7
##
## Step: AIC=1556.34
## Revenues ~ X15x12 + X44x556 + X400x300
##
## Df Sum of Sq RSS AIC
## + .bmp 1 3641 58624 1541.0
## + youtube 1 1284 60981 1552.3
## + X8x15 1 1122 61143 1553.1
## + X15x75 1 947 61318 1553.9
## + loading.time 1 856 61409 1554.4
## + X46x214 1 571 61694 1555.7
## + X24pxx133px 1 486 61778 1556.1
## + .jpg 1 459 61806 1556.2
## <none> 62265 1556.3
## + .gif 1 416 61849 1556.4
## + X100x100 1 386 61879 1556.5
## + X115x223 1 270 61995 1557.1
## + number_of_errors 1 244 62021 1557.2
## + X50x45 1 229 62036 1557.3
## + X292pxx292px 1 193 62072 1557.5
## + X29x29 1 191 62074 1557.5
## + twitter 1 172 62093 1557.5
## + X800x1200 1 160 62105 1557.6
## + Flesh_Mesaure 1 136 62129 1557.7
## + Sentences 1 115 62150 1557.8
## + Readability 1 112 62153 1557.8
## + .jpe 1 100 62165 1557.9
## + .dib 1 100 62165 1557.9
## + .jpeg 1 97 62168 1557.9
## + external 1 97 62168 1557.9
## + linkedin 1 79 62186 1558.0
## + pinterest 1 74 62191 1558.0
## + .tif 1 68 62197 1558.0
## + instagram 1 67 62198 1558.0
## + Words 1 32 62233 1558.2
## + internal 1 17 62247 1558.3
## + Unique.words 1 6 62259 1558.3
## + number_of_warning 1 2 62263 1558.3
## + facebook 1 1 62264 1558.3
## + non.document.error 1 1 62264 1558.3
## + .tiff 1 0 62265 1558.3
## + .png 1 0 62265 1558.3
## - X400x300 1 5982 68247 1580.8
## - X44x556 1 8746 71011 1592.2
## - X15x12 1 45193 107457 1711.5
##
## Step: AIC=1540.99
## Revenues ~ X15x12 + X44x556 + X400x300 + .bmp
##
## Df Sum of Sq RSS AIC
## + X8x15 1 1122 57501 1537.4
## + youtube 1 1035 57588 1537.9
## + X15x75 1 947 57677 1538.3
## + loading.time 1 742 57882 1539.3
## + X46x214 1 571 58053 1540.2
## + X24pxx133px 1 486 58137 1540.6
## + .jpg 1 429 58195 1540.9
## <none> 58624 1541.0
## + .gif 1 401 58223 1541.0
## + X100x100 1 386 58237 1541.1
## + twitter 1 321 58302 1541.4
## + X115x223 1 270 58354 1541.7
## + number_of_errors 1 232 58391 1541.8
## + X50x45 1 229 58394 1541.9
## + Sentences 1 209 58414 1542.0
## + X292pxx292px 1 193 58430 1542.0
## + X29x29 1 191 58432 1542.0
## + linkedin 1 167 58457 1542.2
## + X800x1200 1 160 58464 1542.2
## + Flesh_Mesaure 1 141 58482 1542.3
## + .jpe 1 106 58517 1542.5
## + .jpeg 1 102 58521 1542.5
## + .dib 1 90 58534 1542.5
## + instagram 1 87 58536 1542.6
## + Words 1 80 58544 1542.6
## + Readability 1 79 58545 1542.6
## + pinterest 1 61 58562 1542.7
## + external 1 57 58567 1542.7
## + internal 1 52 58571 1542.7
## + .tif 1 26 58597 1542.9
## + facebook 1 11 58613 1542.9
## + number_of_warning 1 3 58620 1543.0
## + Unique.words 1 1 58623 1543.0
## + non.document.error 1 1 58623 1543.0
## + .png 1 0 58623 1543.0
## + .tiff 1 0 58624 1543.0
## - .bmp 1 3641 62265 1556.3
## - X400x300 1 5982 64606 1567.0
## - X44x556 1 8746 67369 1579.0
## - X15x12 1 45466 104089 1704.3
##
## Step: AIC=1537.42
## Revenues ~ X15x12 + X44x556 + X400x300 + .bmp + X8x15
##
## Df Sum of Sq RSS AIC
## + youtube 1 1146 56355 1533.6
## + loading.time 1 601 56900 1536.4
## + X46x214 1 571 56931 1536.5
## + X24pxx133px 1 486 57015 1537.0
## + twitter 1 464 57037 1537.1
## + .jpg 1 438 57063 1537.2
## + .gif 1 407 57094 1537.4
## <none> 57501 1537.4
## + X100x100 1 386 57115 1537.5
## + number_of_errors 1 273 57229 1538.0
## + X115x223 1 270 57232 1538.1
## + X15x75 1 262 57239 1538.1
## + Sentences 1 258 57243 1538.1
## + X50x45 1 229 57272 1538.3
## + linkedin 1 212 57289 1538.4
## + X292pxx292px 1 193 57308 1538.5
## + X29x29 1 191 57310 1538.5
## + Flesh_Mesaure 1 180 57322 1538.5
## + instagram 1 179 57323 1538.5
## + X800x1200 1 160 57342 1538.6
## + Readability 1 129 57372 1538.8
## + .jpe 1 106 57395 1538.9
## + .jpeg 1 102 57399 1538.9
## + Words 1 101 57400 1538.9
## + .dib 1 90 57412 1539.0
## + pinterest 1 61 57440 1539.1
## + internal 1 58 57443 1539.1
## + external 1 57 57444 1539.1
## + facebook 1 46 57456 1539.2
## + number_of_warning 1 38 57464 1539.2
## + .tif 1 26 57475 1539.3
## + Unique.words 1 11 57490 1539.4
## + non.document.error 1 1 57501 1539.4
## + .png 1 0 57501 1539.4
## + .tiff 1 0 57501 1539.4
## - X8x15 1 1122 58624 1541.0
## - X44x556 1 1150 58651 1541.1
## - .bmp 1 3641 61143 1553.1
## - X400x300 1 5982 63484 1563.9
## - X15x12 1 45466 102967 1703.2
##
## Step: AIC=1533.62
## Revenues ~ X15x12 + X44x556 + X400x300 + .bmp + X8x15 + youtube
##
## Df Sum of Sq RSS AIC
## + loading.time 1 617 55738 1532.5
## + X46x214 1 536 55819 1532.9
## + X24pxx133px 1 486 55869 1533.1
## + X100x100 1 416 55939 1533.5
## <none> 56355 1533.6
## + .gif 1 389 55966 1533.6
## + X115x223 1 349 56006 1533.8
## + X29x29 1 304 56051 1534.1
## + number_of_errors 1 302 56053 1534.1
## + .jpg 1 284 56071 1534.2
## + X50x45 1 264 56091 1534.3
## + X292pxx292px 1 261 56094 1534.3
## + X800x1200 1 255 56100 1534.3
## + Flesh_Mesaure 1 210 56145 1534.5
## + pinterest 1 200 56155 1534.6
## + X15x75 1 179 56176 1534.7
## + Sentences 1 152 56203 1534.8
## + facebook 1 138 56216 1534.9
## + .dib 1 104 56251 1535.1
## + Readability 1 97 56258 1535.1
## + .jpe 1 74 56281 1535.2
## + .jpeg 1 72 56283 1535.2
## + Words 1 62 56293 1535.3
## + twitter 1 37 56318 1535.4
## + external 1 34 56321 1535.5
## + number_of_warning 1 32 56323 1535.5
## + .tif 1 24 56331 1535.5
## + internal 1 15 56340 1535.5
## + instagram 1 10 56345 1535.6
## + .tiff 1 5 56350 1535.6
## + .png 1 2 56353 1535.6
## + Unique.words 1 1 56353 1535.6
## + linkedin 1 0 56354 1535.6
## + non.document.error 1 0 56355 1535.6
## - X44x556 1 1085 57440 1537.1
## - youtube 1 1146 57501 1537.4
## - X8x15 1 1233 57588 1537.9
## - .bmp 1 3380 59735 1548.4
## - X400x300 1 5677 62032 1559.3
## - X15x12 1 45753 102108 1702.8
##
## Step: AIC=1532.45
## Revenues ~ X15x12 + X44x556 + X400x300 + .bmp + X8x15 + youtube +
## loading.time
##
## Df Sum of Sq RSS AIC
## + X46x214 1 597 55141 1531.3
## + X24pxx133px 1 518 55220 1531.8
## + X100x100 1 446 55292 1532.1
## + number_of_errors 1 395 55343 1532.4
## <none> 55738 1532.5
## + X115x223 1 384 55354 1532.5
## + .gif 1 377 55361 1532.5
## + X29x29 1 357 55381 1532.6
## + .jpg 1 353 55385 1532.6
## + X800x1200 1 332 55406 1532.7
## + Sentences 1 287 55451 1533.0
## + X50x45 1 282 55456 1533.0
## + Flesh_Mesaure 1 253 55485 1533.1
## + X292pxx292px 1 241 55497 1533.2
## + pinterest 1 193 55545 1533.5
## + Words 1 189 55549 1533.5
## + X15x75 1 180 55558 1533.5
## + .dib 1 177 55561 1533.5
## - loading.time 1 617 56355 1533.6
## + facebook 1 139 55599 1533.7
## + external 1 115 55623 1533.9
## + Readability 1 93 55645 1534.0
## + .jpe 1 76 55662 1534.1
## + .jpeg 1 75 55663 1534.1
## + internal 1 74 55664 1534.1
## + twitter 1 65 55673 1534.1
## + .tif 1 54 55684 1534.2
## + Unique.words 1 50 55688 1534.2
## + .png 1 32 55706 1534.3
## + number_of_warning 1 25 55713 1534.3
## + instagram 1 16 55722 1534.4
## + .tiff 1 8 55730 1534.4
## + linkedin 1 7 55731 1534.4
## + non.document.error 1 0 55738 1534.5
## - X8x15 1 1085 56823 1536.0
## - youtube 1 1162 56900 1536.4
## - X44x556 1 1199 56937 1536.6
## - .bmp 1 3278 59016 1546.9
## - X400x300 1 5596 61334 1558.0
## - X15x12 1 45266 101004 1701.7
##
## Step: AIC=1531.35
## Revenues ~ X15x12 + X44x556 + X400x300 + .bmp + X8x15 + youtube +
## loading.time + X46x214
##
## Df Sum of Sq RSS AIC
## + .gif 1 405 54736 1531.2
## + number_of_errors 1 405 54736 1531.2
## + X115x223 1 385 54756 1531.3
## <none> 55141 1531.3
## + X29x29 1 358 54783 1531.5
## + .jpg 1 335 54806 1531.6
## + Flesh_Mesaure 1 284 54857 1531.9
## + Sentences 1 273 54868 1531.9
## + X292pxx292px 1 239 54902 1532.1
## + X15x75 1 181 54960 1532.4
## + Words 1 174 54967 1532.4
## + facebook 1 172 54969 1532.5
## - X46x214 1 597 55738 1532.5
## + .dib 1 167 54974 1532.5
## + pinterest 1 165 54976 1532.5
## + Readability 1 148 54993 1532.6
## + X800x1200 1 134 55007 1532.7
## - loading.time 1 678 55819 1532.9
## + external 1 87 55054 1532.9
## + internal 1 68 55073 1533.0
## + X100x100 1 65 55076 1533.0
## + Unique.words 1 53 55087 1533.1
## + .tif 1 51 55090 1533.1
## + twitter 1 47 55094 1533.1
## + X24pxx133px 1 37 55104 1533.2
## + X50x45 1 31 55110 1533.2
## + number_of_warning 1 21 55120 1533.2
## + instagram 1 15 55126 1533.3
## - X44x556 1 760 55901 1533.3
## + .png 1 11 55130 1533.3
## + .tiff 1 8 55133 1533.3
## + linkedin 1 7 55134 1533.3
## + .jpe 1 6 55135 1533.3
## + .jpeg 1 5 55136 1533.3
## + non.document.error 1 0 55141 1533.3
## - X8x15 1 1076 56217 1534.9
## - youtube 1 1126 56267 1535.2
## - X400x300 1 1712 56853 1538.2
## - .bmp 1 3277 58418 1546.0
## - X15x12 1 45240 100381 1701.9
##
## Step: AIC=1531.23
## Revenues ~ X15x12 + X44x556 + X400x300 + .bmp + X8x15 + youtube +
## loading.time + X46x214 + .gif
##
## Df Sum of Sq RSS AIC
## + number_of_errors 1 457 54279 1530.8
## <none> 54736 1531.2
## + X115x223 1 366 54369 1531.3
## - .gif 1 405 55141 1531.3
## + X29x29 1 355 54381 1531.3
## + .jpg 1 353 54383 1531.4
## + Flesh_Mesaure 1 293 54443 1531.7
## + X292pxx292px 1 237 54499 1532.0
## + Sentences 1 223 54513 1532.0
## + facebook 1 200 54536 1532.2
## + .dib 1 183 54553 1532.3
## + X15x75 1 182 54554 1532.3
## + Readability 1 154 54582 1532.4
## + Words 1 147 54589 1532.5
## - X46x214 1 625 55361 1532.5
## + pinterest 1 126 54610 1532.6
## + X800x1200 1 117 54619 1532.6
## - loading.time 1 666 55402 1532.7
## + external 1 93 54642 1532.7
## + internal 1 63 54673 1532.9
## + .tif 1 59 54677 1532.9
## + X24pxx133px 1 45 54691 1533.0
## + X100x100 1 41 54695 1533.0
## + Unique.words 1 40 54696 1533.0
## - X44x556 1 726 55462 1533.0
## + .png 1 36 54700 1533.0
## + .jpe 1 30 54706 1533.1
## + .jpeg 1 29 54707 1533.1
## + X50x45 1 28 54708 1533.1
## + twitter 1 22 54714 1533.1
## + number_of_warning 1 20 54716 1533.1
## + instagram 1 20 54716 1533.1
## + .tiff 1 6 54730 1533.2
## + non.document.error 1 6 54730 1533.2
## + linkedin 1 0 54736 1533.2
## - X8x15 1 1083 55819 1534.9
## - youtube 1 1107 55843 1535.0
## - X400x300 1 1683 56419 1538.0
## - .bmp 1 3266 58002 1545.9
## - X15x12 1 45298 100033 1702.9
##
## Step: AIC=1530.81
## Revenues ~ X15x12 + X44x556 + X400x300 + .bmp + X8x15 + youtube +
## loading.time + X46x214 + .gif + number_of_errors
##
## Df Sum of Sq RSS AIC
## <none> 54279 1530.8
## + X115x223 1 360 53919 1530.9
## + Flesh_Mesaure 1 353 53926 1530.9
## + X29x29 1 334 53945 1531.0
## - number_of_errors 1 457 54736 1531.2
## - .gif 1 458 54736 1531.2
## + facebook 1 284 53995 1531.3
## + Readability 1 219 54060 1531.7
## + X292pxx292px 1 195 54084 1531.8
## + X15x75 1 178 54101 1531.9
## + .jpg 1 176 54102 1531.9
## + .dib 1 165 54114 1531.9
## + pinterest 1 138 54141 1532.1
## + X800x1200 1 134 54144 1532.1
## - X46x214 1 638 54916 1532.2
## + Sentences 1 83 54196 1532.4
## + Words 1 73 54206 1532.4
## + non.document.error 1 63 54216 1532.5
## + X100x100 1 62 54216 1532.5
## + X50x45 1 49 54230 1532.5
## + X24pxx133px 1 37 54242 1532.6
## - X44x556 1 726 55004 1532.6
## + external 1 22 54257 1532.7
## + twitter 1 15 54263 1532.7
## + .png 1 14 54264 1532.7
## + Unique.words 1 13 54265 1532.7
## + .tif 1 10 54269 1532.8
## + instagram 1 6 54273 1532.8
## + .tiff 1 5 54274 1532.8
## + .jpe 1 4 54275 1532.8
## + .jpeg 1 3 54275 1532.8
## + linkedin 1 3 54276 1532.8
## + number_of_warning 1 0 54278 1532.8
## + internal 1 0 54278 1532.8
## - loading.time 1 771 55050 1532.9
## - X8x15 1 1126 55405 1534.7
## - youtube 1 1142 55421 1534.8
## - X400x300 1 1610 55888 1537.2
## - .bmp 1 3237 57516 1545.5
## - X15x12 1 45164 99442 1703.2
#Summary of model_a, the model returned by the stepwise search
summary(model_a)
##
## Call:
## lm(formula = Revenues ~ X15x12 + X44x556 + X400x300 + .bmp +
## X8x15 + youtube + loading.time + X46x214 + .gif + number_of_errors,
## data = total_500_final_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.254 -7.695 -3.529 2.137 68.673
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 220.273625 9.934832 22.172 < 2e-16 ***
## X15x12 -87.818989 5.784528 -15.182 < 2e-16 ***
## X44x556 -29.578085 15.369922 -1.924 0.05533 .
## X400x300 -28.443747 9.923508 -2.866 0.00447 **
## .bmp 2.470545 0.607833 4.065 6.28e-05 ***
## X8x15 -41.311818 17.232973 -2.397 0.01718 *
## youtube 4.058630 1.681080 2.414 0.01641 *
## loading.time -4.490750 2.264113 -1.983 0.04830 *
## X46x214 -18.477039 10.243269 -1.804 0.07235 .
## .gif -0.120050 0.078556 -1.528 0.12760
## number_of_errors 0.013675 0.008952 1.528 0.12774
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14 on 277 degrees of freedom
## Multiple R-squared: 0.8395, Adjusted R-squared: 0.8338
## F-statistic: 144.9 on 10 and 277 DF, p-value: < 2.2e-16
# Keep the adjusted R-squared and the AIC of model_a for the comparison
# further down, then draw diagnostic plots 1 to 3 of the fit
ad_r_sq_ma <- summary(model_a)[["adj.r.squared"]]
aic_ma <- AIC(model_a)
plot(model_a, which = 1:3)
## Warning: not plotting observations with leverage one:
## 3



################
#Compare the adjusted R-squared and the AIC of the fitted models to find the best one
ad_r_sq_f3
## [1] 0.830413
ad_r_sq_f4
## [1] 0.8161957
ad_r_sq_ma #BEST
## [1] 0.8337517
#model_a has the highest adjusted R-squared (the closer to 1 the better)
aic_f3
## [1] 2362.474
aic_f4
## [1] 2378.069
aic_ma #Best
## [1] 2350.119
#model_a also has the lowest AIC (lower is better), so it is the best of the three models on both criteria
#######################################################################################################
#Compare the actual test-set revenues with the predictions of model_a, full_3 and full_4
#on a 2x2 grid of index plots.
par(mfrow = c(2, 2))
Actual_Revenues <- total_500_final_test$Revenues
predictions_ma <- predict(model_a, newdata = total_500_final_test)
predictions_full3 <- predict(full_3, newdata = total_500_final_test)
predictions_full4 <- predict(full_4, newdata = total_500_final_test)
#Use one y-axis range for all four panels: with the default auto-scaling each panel
#gets its own axis, so the shapes of the curves cannot be compared visually.
revenue_ylim <- range(
  Actual_Revenues, predictions_ma, predictions_full3, predictions_full4,
  finite = TRUE
)
plot(Actual_Revenues, col = "blue", ylim = revenue_ylim)
###########################################
plot(predictions_ma, col = "Red", main = "Model a", ylim = revenue_ylim)
#####################################
plot(predictions_full3, col = "Red", main = "Full_3 model", ylim = revenue_ylim)
#####################################
plot(predictions_full4, col = "Red", main = "Full_4 model", ylim = revenue_ylim)

#####################################
#From the plots above we can see that the actual Revenues follow a smooth curve, except for the Revenues of the #1 ranked company, which are extremely high compared with the other companies.
#The smoothest prediction curve is that of model a, which, as noted before, also has the best adjusted R-squared and the best AIC value.
#List the column names of the training set (console output pasted below)
names(total_500_final_train)
## [1] "Revenues" "non.document.error" "number_of_errors"
## [4] "number_of_warning" "facebook" "instagram"
## [7] "linkedin" "pinterest" "twitter"
## [10] "youtube" "Flesh_Mesaure" "Readability"
## [13] "Sentences" "Unique.words" "Words"
## [16] "external" "internal" "total.links"
## [19] "X15x75" "X8x15" "X44x556"
## [22] "X1x1" "X800x1200" "X100x100"
## [25] "X24pxx133px" "X21pxx173px" "X46x214"
## [28] "X49x49" "X50x45" "X400x300"
## [31] "X292pxx292px" "X200pxx200px" "X1279pxx984px"
## [34] "X300pxx1500px" "X29x29" "X115x223"
## [37] "X160x233" "X300x993" "X41x192"
## [40] "X28x221" "X15x12" "X60x60"
## [43] ".bmp" ".dib" ".gif"
## [46] ".jpe" ".jpeg" ".jpg"
## [49] ".png" ".tif" ".tiff"
## [52] "total.images" "loading.time"
#Back to a single plotting panel
par(mfrow = c(1, 1))
#Keep Revenues and the ten predictors of model_a, in the order of the model formula.
#The columns are selected by name rather than by position, so the selection cannot
#silently pick the wrong variables if the column layout of the training set changes.
total_500_final_reg <- total_500_final_train[, c(
  "Revenues", "X15x12", "X44x556", "X400x300", ".bmp", "X8x15",
  "youtube", "loading.time", "X46x214", ".gif", "number_of_errors"
)]
#Pairwise correlations between Revenues and the predictors
cor(total_500_final_reg)
## Revenues X15x12 X44x556 X400x300
## Revenues 1.00000000 -0.866071521 -0.554995003 -0.810238376
## X15x12 -0.86607152 1.000000000 0.409636250 0.795640062
## X44x556 -0.55499500 0.409636250 1.000000000 0.514851212
## X400x300 -0.81023838 0.795640062 0.514851212 1.000000000
## .bmp 0.08823686 0.017894718 0.007330325 0.014237755
## X8x15 -0.48572735 0.333881354 0.815067889 0.419638691
## youtube 0.09933975 -0.034098547 -0.018043770 -0.059515452
## loading.time -0.12686096 0.086897868 0.030631302 0.079596684
## X46x214 -0.73936498 0.674879385 0.606976979 0.848221975
## .gif -0.04436402 0.002618064 0.030679347 0.008558755
## number_of_errors 0.07106309 -0.052737836 0.005969412 -0.058134366
## .bmp X8x15 youtube loading.time
## Revenues 0.088236861 -0.485727351 0.09933975 -0.12686096
## X15x12 0.017894718 0.333881354 -0.03409855 0.08689787
## X44x556 0.007330325 0.815067889 -0.01804377 0.03063130
## X400x300 0.014237755 0.419638691 -0.05951545 0.07959668
## .bmp 1.000000000 0.005974713 0.06081623 -0.03202094
## X8x15 0.005974713 1.000000000 0.01353035 0.07305785
## youtube 0.060816230 0.013530346 1.00000000 0.00711365
## loading.time -0.032020942 0.073057846 0.00711365 1.00000000
## X46x214 0.012076776 0.494727445 -0.05826545 0.04051586
## .gif -0.006020021 0.022359056 -0.01420548 0.01202806
## number_of_errors 0.005372103 0.026722692 -0.01898525 0.09290871
## X46x214 .gif number_of_errors
## Revenues -0.7393649844 -0.0443640226 0.071063094
## X15x12 0.6748793854 0.0026180642 -0.052737836
## X44x556 0.6069769787 0.0306793475 0.005969412
## X400x300 0.8482219754 0.0085587551 -0.058134366
## .bmp 0.0120767763 -0.0060200206 0.005372103
## X8x15 0.4947274449 0.0223590563 0.026722692
## youtube -0.0582654543 -0.0142054765 -0.018985254
## loading.time 0.0405158629 0.0120280632 0.092908709
## X46x214 1.0000000000 -0.0009473611 -0.038355173
## .gif -0.0009473611 1.0000000000 0.062343582
## number_of_errors -0.0383551731 0.0623435816 1.000000000
#Draw the same correlation matrix with the coefficient printed in every cell
corrplot(cor(total_500_final_reg), method = "number")

#X8x15 is highly correlated with X44x556 (0.82) and X15x12 is highly correlated with X400x300 (0.80).
#We refit the model without X8x15 and X15x12 (one variable from each pair) to see whether it improves.
#NOTE(review): X400x300 and X46x214 are also strongly correlated (0.85) but both stay in the model.
full_5 <- lm(
  Revenues ~ 1 + X44x556 + X400x300 + .bmp + youtube + loading.time +
    X46x214 + .gif + number_of_errors,
  data = total_500_final_train
)
summary(full_5)
##
## Call:
## lm(formula = Revenues ~ 1 + X44x556 + X400x300 + .bmp + youtube +
## loading.time + X46x214 + .gif + number_of_errors, data = total_500_final_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.584 -9.736 -5.298 0.999 97.593
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 207.33952 11.08956 18.697 < 2e-16 ***
## X44x556 -57.05604 13.88428 -4.109 5.22e-05 ***
## X400x300 -113.81359 11.08164 -10.270 < 2e-16 ***
## .bmp 2.37017 0.82413 2.876 0.00434 **
## youtube 3.34344 2.27623 1.469 0.14300
## loading.time -6.30802 3.05772 -2.063 0.04004 *
## X46x214 -18.96451 13.88890 -1.365 0.17321
## .gif -0.11152 0.10651 -1.047 0.29600
## number_of_errors 0.01489 0.01213 1.227 0.22072
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.98 on 279 degrees of freedom
## Multiple R-squared: 0.7029, Adjusted R-squared: 0.6943
## F-statistic: 82.5 on 8 and 279 DF, p-value: < 2.2e-16
#Keep the adjusted R-squared and the AIC of full_5 for the comparison with model_a
adj_r_square_full5 <- summary(full_5)[["adj.r.squared"]]
aic_full5 <- AIC(object = full_5)
#Diagnostic plots 1 to 3 of plot.lm: Residuals vs Fitted, Normal Q-Q and Scale-Location
plot(x = full_5, which = seq_len(3))



#Adjusted R-squared and AIC of model_a next to those of full_5
ad_r_sq_ma
## [1] 0.8337517
adj_r_square_full5
## [1] 0.6943474
aic_ma
## [1] 2350.119
aic_full5
## [1] 2523.573
#full_5 is clearly worse than model_a: the adjusted R-squared drops from 0.83 to 0.69 and the AIC rises from 2350 to 2524
#######################################################################################################
##################################################################################################
#Clustering
#Hierarchical clustering (dendrogram) followed by k-means with 2 centers
#Both are computed on column 1 of total_500_final_reg only
#NOTE(review): column 1 is Revenues, so the clusters are built from Revenues alone and not from
#the regression predictors; a split along the Revenues axis in the plots below is therefore expected
set.seed(220)
clusters <- hclust(d = dist(x = total_500_final_reg[, 1]))
plot(clusters)

fortuneCluster <- kmeans(
  x = total_500_final_reg[, 1],
  centers = 2,
  iter.max = 500,
  nstart = 1
)
cluster <- table(fortuneCluster$cluster)
#The cluster labels become a factor so that ggplot colours them as discrete groups
fortuneCluster$cluster <- as.factor(fortuneCluster$cluster)
#Shared base plot: Revenues on the x axis, points coloured by cluster.
#Each line below only adds the variable shown on the y axis.
revenue_cluster_plot <- ggplot(
  total_500_final_reg,
  aes(x = Revenues, color = fortuneCluster$cluster)
) +
  geom_point(size = 3)
revenue_cluster_plot + aes(y = loading.time)

revenue_cluster_plot + aes(y = youtube)

revenue_cluster_plot + aes(y = .gif)

revenue_cluster_plot + aes(y = .bmp)

revenue_cluster_plot + aes(y = X46x214)

revenue_cluster_plot + aes(y = X15x12)

revenue_cluster_plot + aes(y = X44x556)

revenue_cluster_plot + aes(y = X400x300)

revenue_cluster_plot + aes(y = X8x15)

#From the clustering we can see that the variables do indeed divide the highest revenues from the lowest ones
#NOTE(review): the clusters were computed from Revenues itself (total_500_final_reg[, 1]), so this separation is expected - confirm the intended clustering input
#Print the summary of model_a again as the starting point for the next model
summary(model_a)
##
## Call:
## lm(formula = Revenues ~ X15x12 + X44x556 + X400x300 + .bmp +
## X8x15 + youtube + loading.time + X46x214 + .gif + number_of_errors,
## data = total_500_final_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.254 -7.695 -3.529 2.137 68.673
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 220.273625 9.934832 22.172 < 2e-16 ***
## X15x12 -87.818989 5.784528 -15.182 < 2e-16 ***
## X44x556 -29.578085 15.369922 -1.924 0.05533 .
## X400x300 -28.443747 9.923508 -2.866 0.00447 **
## .bmp 2.470545 0.607833 4.065 6.28e-05 ***
## X8x15 -41.311818 17.232973 -2.397 0.01718 *
## youtube 4.058630 1.681080 2.414 0.01641 *
## loading.time -4.490750 2.264113 -1.983 0.04830 *
## X46x214 -18.477039 10.243269 -1.804 0.07235 .
## .gif -0.120050 0.078556 -1.528 0.12760
## number_of_errors 0.013675 0.008952 1.528 0.12774
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14 on 277 degrees of freedom
## Multiple R-squared: 0.8395, Adjusted R-squared: 0.8338
## F-statistic: 144.9 on 10 and 277 DF, p-value: < 2.2e-16
#In model_a the variable with the strongest effect on Revenues is X15x12 (largest absolute t value, -15.18)
#(presumably X15x12 records whether the site has an image of size 15x12 - to be confirmed)
#We fit a model that leaves this variable out completely, to see how well the remaining variables explain the revenues
full_6 <- lm(
  Revenues ~ 1 + X44x556 + X400x300 + .bmp + X8x15 + youtube +
    loading.time + X46x214 + .gif + number_of_errors,
  data = total_500_final_train
)
summary(full_6)
##
## Call:
## lm(formula = Revenues ~ 1 + X44x556 + X400x300 + .bmp + X8x15 +
## youtube + loading.time + X46x214 + .gif + number_of_errors,
## data = total_500_final_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.941 -9.536 -5.194 0.764 97.740
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 220.52558 13.42298 16.429 < 2e-16 ***
## X44x556 -30.25871 20.76628 -1.457 0.14622
## X400x300 -113.89510 11.04234 -10.314 < 2e-16 ***
## .bmp 2.36934 0.82120 2.885 0.00422 **
## X8x15 -40.28865 23.28336 -1.730 0.08468 .
## youtube 3.53481 2.27084 1.557 0.12070
## loading.time -5.88761 3.05653 -1.926 0.05509 .
## X46x214 -18.85123 13.83968 -1.362 0.17426
## .gif -0.11274 0.10614 -1.062 0.28906
## number_of_errors 0.01555 0.01209 1.285 0.19971
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.91 on 278 degrees of freedom
## Multiple R-squared: 0.706, Adjusted R-squared: 0.6965
## F-statistic: 74.19 on 9 and 278 DF, p-value: < 2.2e-16
#Keep the adjusted R-squared and the AIC of full_6
adj_r_square_full6 <- summary(full_6)[["adj.r.squared"]]
aic_full6 <- AIC(object = full_6)
#Diagnostic plots 1 to 3 of plot.lm: Residuals vs Fitted, Normal Q-Q and Scale-Location
plot(x = full_6, which = seq_len(3))



#Compare the actual test-set revenues with the predictions of model_a and full_6
predictions_ma <- predict(model_a, newdata = total_500_final_test)
Actual_Revenues <- total_500_final_test$Revenues
predictions_full_6 <- predict(full_6, newdata = total_500_final_test)
#Use one y-axis range for all panels: with the default auto-scaling each panel
#gets its own axis, so the shapes of the curves cannot be compared visually.
revenue_ylim <- range(
  Actual_Revenues, predictions_ma, predictions_full_6,
  finite = TRUE
)
par(mfrow = c(2, 2))
plot(Actual_Revenues, col = "blue", ylim = revenue_ylim)
plot(predictions_ma, col = "Red", main = "Model A", ylim = revenue_ylim)
#####################################
plot(predictions_full_6, col = "Red", main = "Full_6 model", ylim = revenue_ylim)
#######################################################

#The prediction of the new model is not as good as that of model_a, so, having checked this option as well, we can conclude that the most important factors are the ones in model_a
#Print the summary of model_a once more as the final model
summary(model_a)
##
## Call:
## lm(formula = Revenues ~ X15x12 + X44x556 + X400x300 + .bmp +
## X8x15 + youtube + loading.time + X46x214 + .gif + number_of_errors,
## data = total_500_final_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.254 -7.695 -3.529 2.137 68.673
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 220.273625 9.934832 22.172 < 2e-16 ***
## X15x12 -87.818989 5.784528 -15.182 < 2e-16 ***
## X44x556 -29.578085 15.369922 -1.924 0.05533 .
## X400x300 -28.443747 9.923508 -2.866 0.00447 **
## .bmp 2.470545 0.607833 4.065 6.28e-05 ***
## X8x15 -41.311818 17.232973 -2.397 0.01718 *
## youtube 4.058630 1.681080 2.414 0.01641 *
## loading.time -4.490750 2.264113 -1.983 0.04830 *
## X46x214 -18.477039 10.243269 -1.804 0.07235 .
## .gif -0.120050 0.078556 -1.528 0.12760
## number_of_errors 0.013675 0.008952 1.528 0.12774
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14 on 277 degrees of freedom
## Multiple R-squared: 0.8395, Adjusted R-squared: 0.8338
## F-statistic: 144.9 on 10 and 277 DF, p-value: < 2.2e-16